/*******************************************************************************
* Copyright 2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "cpu/x64/jit_generator.hpp"

#include "cpu/x64/gemm/s8x8s32/common_u8.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

// Constructs the kernel generator by forwarding nullptr and
// S8U8S32_COMPUTE_KERNEL_CODE_SIZE to the jit_generator base constructor.
// The constructor body itself is empty; the instruction stream is written
// by generate().
// NOTE(review): presumably nullptr means "no caller-supplied code buffer" and
// the constant is the size of the code buffer -- confirm against the
// jit_generator constructor declaration.
jit_sse41_kernel_b0_gemm_s8u8s32_kern::jit_sse41_kernel_b0_gemm_s8u8s32_kern()
    : jit_generator(nullptr, S8U8S32_COMPUTE_KERNEL_CODE_SIZE) {}

void jit_sse41_kernel_b0_gemm_s8u8s32_kern::generate() {

#ifndef _WIN32

#define M rdi
#define N rsi
#define K rdx
#define A r8
#define B r9
#define C r10
#define LDC r11

#define AA rcx
#define I r12
#define J r13
#define H rax
#define AO r14
#define BO r15
#define CO1 rbx
#define CO2 rbp

#else

#define M rcx
#define N rdx
#define K r8
#define A rsi
#define B r9
#define C r10
#define LDC r11

#define AA rdi
#define I r12
#define J r13
#define H rax
#define AO r14
#define BO r15
#define CO1 rbx
#define CO2 rbp

#endif

#ifdef _WIN32
#define ARG_A (args_offset - 16) + rsp
#define ARG_B (args_offset - 8) + rsp
#endif
#define ARG_C ((args_offset + 0) + rsp)
#define ARG_LDC ((args_offset + 8) + rsp)

    inLocalLabel();
    {

        Xbyak::Label l1020;
        Xbyak::Label l1058;
        Xbyak::Label l10b0;
        Xbyak::Label l1118;
        Xbyak::Label l1130;
        Xbyak::Label l1134;
        Xbyak::Label l116c;
        Xbyak::Label l1194;
        Xbyak::Label l1220;
        Xbyak::Label l1234;
        Xbyak::Label l12c0;
        Xbyak::Label l12fc;
        Xbyak::Label l1350;
        Xbyak::Label l13b0;
        Xbyak::Label l13d8;
        Xbyak::Label l1408;
        Xbyak::Label l146c;
        Xbyak::Label l1478;
        Xbyak::Label l14dc;
        Xbyak::Label l1504;
        Xbyak::Label l1544;
        Xbyak::Label l1590;
        Xbyak::Label l15a0;
        Xbyak::Label l15a4;
        Xbyak::Label l15dc;
        Xbyak::Label l1604;
        Xbyak::Label l1690;
        Xbyak::Label l16a4;
        Xbyak::Label l1730;
        Xbyak::Label l176c;
        Xbyak::Label l17c0;
        Xbyak::Label l1820;
        Xbyak::Label l1844;
        Xbyak::Label l1874;
        Xbyak::Label l18d8;
        Xbyak::Label l18e4;
        Xbyak::Label l1948;
        Xbyak::Label l1970;
        Xbyak::Label l19b0;
        Xbyak::Label l19fc;
        Xbyak::Label l1a08;
        Xbyak::Label l1a0c;
        Xbyak::Label l1a44;
        Xbyak::Label l1a6c;
        Xbyak::Label l1af8;
        Xbyak::Label l1b0c;
        Xbyak::Label l1b98;
        Xbyak::Label l1bd4;
        Xbyak::Label l1c28;
        Xbyak::Label l1c88;
        Xbyak::Label l1cb0;
        Xbyak::Label l1ce0;
        Xbyak::Label l1d44;
        Xbyak::Label l1d50;
        Xbyak::Label l1db4;
        Xbyak::Label l1ddc;
        Xbyak::Label l1e1c;
        Xbyak::Label l1e68;
        Xbyak::Label l1e78;
        Xbyak::Label l1e7c;
        Xbyak::Label l26c;
        Xbyak::Label l280;
        Xbyak::Label l400;
        Xbyak::Label l4a4;
        Xbyak::Label l578;
        Xbyak::Label l668;
        Xbyak::Label l6cc;
        Xbyak::Label l714;
        Xbyak::Label l808;
        Xbyak::Label l818;
        Xbyak::Label l8c;
        Xbyak::Label l90c;
        Xbyak::Label l968;
        Xbyak::Label l9f0;
        Xbyak::Label la98;
        Xbyak::Label lac4;
        Xbyak::Label lad8;
        Xbyak::Label lb10;
        Xbyak::Label lb3c;
        Xbyak::Label lb4;
        Xbyak::Label lc18;
        Xbyak::Label lc2c;
        Xbyak::Label ld08;
        Xbyak::Label ld64;
        Xbyak::Label lde0;
        Xbyak::Label le70;
        Xbyak::Label leac;
        Xbyak::Label lec;
        Xbyak::Label lee8;
        Xbyak::Label lf7c;
        Xbyak::Label lf8c;

        auto stack_alloc_size = 32;
        auto args_offset = stack_alloc_size + get_size_of_abi_save_regs() + 8;
#ifdef _WIN32
        args_offset += 48;
#endif
        preamble();
        sub(rsp, stack_alloc_size);
#ifdef _WIN32
        mov(A, ptr[ARG_A]);
        mov(B, ptr[ARG_B]);
#endif

        mov(C, qword[ARG_C]);
        mov(LDC, qword[ARG_LDC]);
        sub(A, -128);
        sub(B, -128);
        mov(M, qword[M]);
        mov(N, qword[N]);
        mov(K, qword[K]);
        lea(LDC, ptr[LDC * 4 + 0x0]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        xorps(xmm12, xmm12);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(H, 0x10001);
        movq(xmm7, H);
        pshufd(xmm7, xmm7, 0x0);
        mov(J, M);
        cmp(J, 0x10);
        jl(lad8, T_NEAR);
        align(4);

        L(l8c);
        mov(CO1, C);
        add(C, 0x40);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x20);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(I, N);
        cmp(I, 0x2);
        jl(l6cc, T_NEAR);
        align(4);

        L(lb4);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm2, xword[AO - 0x60]);
        movdqu(xmm3, xword[AO - 0x50]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l400, T_NEAR);
        sub(H, 0x8);
        jle(l26c, T_NEAR);
        align(4);

        L(lec);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(lec, T_NEAR);
        align(4);

        L(l26c);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l400, T_NEAR);
        align(4);

        L(l280);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l280, T_NEAR);
        align(4);

        L(l400);
        mov(H, K);
        test(H, 0x4);
        je(l4a4, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        add(AO, 0x40);
        add(BO, 0x8);
        align(4);

        L(l4a4);
        mov(H, K);
        test(H, 0x2);
        je(l578, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movdqu(xmm2, xword[AO - 0x70]);
        movaps(xmm3, xmm2);
        punpcklwd(xmm2, xmm6);
        punpckhwd(xmm3, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        add(AO, 0x20);
        add(BO, 0x4);
        align(4);

        L(l578);
        mov(H, K);
        test(H, 0x1);
        je(l668, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        pshufd(xmm2, xmm3, 0xaa);
        punpcklbw(xmm2, xmm6);
        punpcklwd(xmm2, xmm6);
        pshufd(xmm3, xmm3, 0xff);
        punpcklbw(xmm3, xmm6);
        punpcklwd(xmm3, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        add(AO, 0x10);
        add(BO, 0x2);
        align(4);

        L(l668);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        movdqu(xword[CO1 + 0x20], xmm12);
        xorps(xmm12, xmm12);
        movdqu(xword[CO1 + 0x30], xmm14);
        xorps(xmm14, xmm14);
        movdqu(xword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        movdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
        xorps(xmm11, xmm11);
        movdqu(xword[CO1 + LDC * 1 + 0x20], xmm13);
        xorps(xmm13, xmm13);
        movdqu(xword[CO1 + LDC * 1 + 0x30], xmm15);
        xorps(xmm15, xmm15);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(lb4, T_NEAR);
        align(4);

        L(l6cc);
        test(I, 0x1);
        jle(lac4, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm2, xword[AO - 0x60]);
        movdqu(xmm3, xword[AO - 0x50]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l90c, T_NEAR);
        sub(H, 0x8);
        jle(l808, T_NEAR);
        align(4);

        L(l714);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l714, T_NEAR);
        align(4);

        L(l808);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l90c, T_NEAR);
        align(4);

        L(l818);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l818, T_NEAR);
        align(4);

        L(l90c);
        mov(H, K);
        test(H, 0x4);
        je(l968, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        add(AO, 0x40);
        add(BO, 0x4);
        align(4);

        L(l968);
        mov(H, K);
        test(H, 0x2);
        je(l9f0, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movdqu(xmm2, xword[AO - 0x70]);
        movaps(xmm3, xmm2);
        punpcklwd(xmm2, xmm6);
        punpckhwd(xmm3, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        add(AO, 0x20);
        add(BO, 0x2);
        align(4);

        L(l9f0);
        mov(H, K);
        test(H, 0x1);
        je(la98, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        pshufd(xmm2, xmm3, 0xaa);
        punpcklbw(xmm2, xmm6);
        punpcklwd(xmm2, xmm6);
        pshufd(xmm3, xmm3, 0xff);
        punpcklbw(xmm3, xmm6);
        punpcklwd(xmm3, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        add(AO, 0x10);
        add(BO, 0x1);
        align(4);

        L(la98);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        movdqu(xword[CO1 + 0x20], xmm12);
        xorps(xmm12, xmm12);
        movdqu(xword[CO1 + 0x30], xmm14);
        xorps(xmm14, xmm14);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(lac4);
        mov(A, AO);
        sub(J, 0x10);
        cmp(J, 0x10);
        jge(l8c, T_NEAR);
        align(4);

        L(lad8);
        test(J, 0x8);
        jle(l1134, T_NEAR);
        mov(CO1, C);
        add(C, 0x20);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x10);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(I, N);
        cmp(I, 0x2);
        jl(leac, T_NEAR);
        align(4);

        L(lb10);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(ld08, T_NEAR);
        sub(H, 0x8);
        jle(lc18, T_NEAR);
        align(4);

        L(lb3c);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(lb3c, T_NEAR);
        align(4);

        L(lc18);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(ld08, T_NEAR);
        align(4);

        L(lc2c);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(lc2c, T_NEAR);
        align(4);

        L(ld08);
        mov(H, K);
        test(H, 0x4);
        je(ld64, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        add(AO, 0x20);
        add(BO, 0x8);
        align(4);

        L(ld64);
        mov(H, K);
        test(H, 0x2);
        je(lde0, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        add(AO, 0x10);
        add(BO, 0x4);
        align(4);

        L(lde0);
        mov(H, K);
        test(H, 0x1);
        je(le70, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        add(AO, 0x8);
        add(BO, 0x2);
        align(4);

        L(le70);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        movdqu(xword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        movdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
        xorps(xmm11, xmm11);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(lb10, T_NEAR);
        align(4);

        L(leac);
        test(I, 0x1);
        jle(l1130, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1020, T_NEAR);
        sub(H, 0x8);
        jle(lf7c, T_NEAR);
        align(4);

        L(lee8);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(lee8, T_NEAR);
        align(4);

        L(lf7c);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l1020, T_NEAR);
        align(4);

        L(lf8c);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(lf8c, T_NEAR);
        align(4);

        L(l1020);
        mov(H, K);
        test(H, 0x4);
        je(l1058, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        add(AO, 0x20);
        add(BO, 0x4);
        align(4);

        L(l1058);
        mov(H, K);
        test(H, 0x2);
        je(l10b0, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        add(AO, 0x10);
        add(BO, 0x2);
        align(4);

        L(l10b0);
        mov(H, K);
        test(H, 0x1);
        je(l1118, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        add(AO, 0x8);
        add(BO, 0x1);
        align(4);

        L(l1118);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l1130);
        mov(A, AO);
        align(4);

        L(l1134);
        test(J, 0x4);
        jle(l15a4, T_NEAR);
        mov(CO1, C);
        add(C, 0x10);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x8);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(I, N);
        cmp(I, 0x2);
        jl(l13d8, T_NEAR);
        align(4);

        L(l116c);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l12c0, T_NEAR);
        sub(H, 0x8);
        jle(l1220, T_NEAR);
        align(4);

        L(l1194);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1194, T_NEAR);
        align(4);

        L(l1220);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l12c0, T_NEAR);
        align(4);

        L(l1234);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1234, T_NEAR);
        align(4);

        L(l12c0);
        mov(H, K);
        test(H, 0x4);
        je(l12fc, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x10);
        add(BO, 0x8);
        align(4);

        L(l12fc);
        mov(H, K);
        test(H, 0x2);
        je(l1350, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x8);
        add(BO, 0x4);
        align(4);

        L(l1350);
        mov(H, K);
        test(H, 0x1);
        je(l13b0, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x4);
        add(BO, 0x2);
        align(4);

        L(l13b0);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(l116c, T_NEAR);
        align(4);

        L(l13d8);
        test(I, 0x1);
        jle(l15a0, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l14dc, T_NEAR);
        sub(H, 0x8);
        jle(l146c, T_NEAR);
        align(4);

        L(l1408);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1408, T_NEAR);
        align(4);

        L(l146c);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l14dc, T_NEAR);
        align(4);

        L(l1478);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1478, T_NEAR);
        align(4);

        L(l14dc);
        mov(H, K);
        test(H, 0x4);
        je(l1504, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x10);
        add(BO, 0x4);
        align(4);

        L(l1504);
        mov(H, K);
        test(H, 0x2);
        je(l1544, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x8);
        add(BO, 0x2);
        align(4);

        L(l1544);
        mov(H, K);
        test(H, 0x1);
        je(l1590, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x4);
        add(BO, 0x1);
        align(4);

        L(l1590);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l15a0);
        mov(A, AO);
        align(4);

        L(l15a4);
        test(J, 0x2);
        jle(l1a0c, T_NEAR);
        mov(CO1, C);
        add(C, 0x8);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x4);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(I, N);
        cmp(I, 0x2);
        jl(l1844, T_NEAR);
        align(4);

        L(l15dc);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1730, T_NEAR);
        sub(H, 0x8);
        jle(l1690, T_NEAR);
        align(4);

        L(l1604);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1604, T_NEAR);
        align(4);

        L(l1690);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l1730, T_NEAR);
        align(4);

        L(l16a4);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l16a4, T_NEAR);
        align(4);

        L(l1730);
        mov(H, K);
        test(H, 0x4);
        je(l176c, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x8);
        add(BO, 0x8);
        align(4);

        L(l176c);
        mov(H, K);
        test(H, 0x2);
        je(l17c0, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x4);
        add(BO, 0x4);
        align(4);

        L(l17c0);
        mov(H, K);
        test(H, 0x1);
        je(l1820, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x2);
        add(BO, 0x2);
        align(4);

        L(l1820);
        movlps(qword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movlps(qword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(l15dc, T_NEAR);
        align(4);

        L(l1844);
        test(I, 0x1);
        jle(l1a08, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1948, T_NEAR);
        sub(H, 0x8);
        jle(l18d8, T_NEAR);
        align(4);

        L(l1874);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1874, T_NEAR);
        align(4);

        L(l18d8);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l1948, T_NEAR);
        align(4);

        L(l18e4);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l18e4, T_NEAR);
        align(4);

        L(l1948);
        mov(H, K);
        test(H, 0x4);
        je(l1970, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x8);
        add(BO, 0x4);
        align(4);

        L(l1970);
        mov(H, K);
        test(H, 0x2);
        je(l19b0, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x4);
        add(BO, 0x2);
        align(4);

        L(l19b0);
        mov(H, K);
        test(H, 0x1);
        je(l19fc, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x2);
        add(BO, 0x1);
        align(4);

        L(l19fc);
        movlps(qword[CO1], xmm8);
        xorps(xmm8, xmm8);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l1a08);
        mov(A, AO);
        align(4);

        L(l1a0c);
        test(J, 0x1);
        jle(l1e7c, T_NEAR);
        mov(CO1, C);
        add(C, 0x4);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x2);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(I, N);
        cmp(I, 0x2);
        jl(l1cb0, T_NEAR);
        align(4);

        L(l1a44);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1b98, T_NEAR);
        sub(H, 0x8);
        jle(l1af8, T_NEAR);
        align(4);

        L(l1a6c);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1a6c, T_NEAR);
        align(4);

        L(l1af8);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l1b98, T_NEAR);
        align(4);

        L(l1b0c);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1b0c, T_NEAR);
        align(4);

        L(l1b98);
        mov(H, K);
        test(H, 0x4);
        je(l1bd4, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x4);
        add(BO, 0x8);
        align(4);

        L(l1bd4);
        mov(H, K);
        test(H, 0x2);
        je(l1c28, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x2);
        add(BO, 0x4);
        align(4);

        L(l1c28);
        mov(H, K);
        test(H, 0x1);
        je(l1c88, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x1);
        add(BO, 0x2);
        align(4);

        L(l1c88);
        movss(dword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movss(dword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(l1a44, T_NEAR);
        align(4);

        L(l1cb0);
        test(I, 0x1);
        jle(l1e78, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1db4, T_NEAR);
        sub(H, 0x8);
        jle(l1d44, T_NEAR);
        align(4);

        L(l1ce0);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1ce0, T_NEAR);
        align(4);

        L(l1d44);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l1db4, T_NEAR);
        align(4);

        L(l1d50);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1d50, T_NEAR);
        align(4);

        L(l1db4);
        mov(H, K);
        test(H, 0x4);
        je(l1ddc, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x4);
        add(BO, 0x4);
        align(4);

        L(l1ddc);
        mov(H, K);
        test(H, 0x2);
        je(l1e1c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x2);
        add(BO, 0x2);
        align(4);

        L(l1e1c);
        mov(H, K);
        test(H, 0x1);
        je(l1e68, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x1);
        add(BO, 0x1);
        align(4);

        L(l1e68);
        movss(dword[CO1], xmm8);
        xorps(xmm8, xmm8);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l1e78);
        mov(A, AO);
        align(4);

        L(l1e7c);
        add(rsp, stack_alloc_size);
        postamble();
    }
    outLocalLabel();

#undef M
#undef N
#undef K
#undef A
#undef B
#undef C
#undef LDC
#undef AA
#undef I
#undef J
#undef H
#undef AO
#undef BO
#undef CO1
#undef CO2
#ifdef _WIN32
#undef ARG_A
#undef ARG_B
#endif
#undef ARG_C
#undef ARG_LDC
}

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl
